clear
set more off

*-------------------------------------------------------------------------------------------------------------
* PRELIMINARY STEPS
*-------------------------------------------------------------------------------------------------------------


//create folder for temporary files
capture mkdir replication_data/temporary_files


*** CPI
import delimited replication_data/CPI_UK.csv, clear
keep year cpi
drop if year == .
save replication_data/temporary_files/cpi.dta, replace


*** ICT
foreach var in 	Kq_IT Kq_CT Kq_Soft_DB Kq_TraEq Kq_OMach Kq_GFCF {
	clear
	import excel using replication_data/UK_capital_17i.xlsx, sheet("`var'") firstrow
	drop if code == "TOT" | code == "MARKT"
	encode code, gen(euklems_num)
	reshape long `var', i(euklems_num) j(year) 
	drop if year<1997
	save replication_data/temporary_files/file_`var', replace
}


import excel using replication_data/UK_output_17i.xlsx, sheet("EMPE") firstrow clear
drop if code == "TOT" | code == "MARKT"
encode code, gen(euklems_num)
reshape long EMPE, i(euklems_num) j(year) 
drop if year<1997
save replication_data/temporary_files/file_EMPE, replace


use "replication_data/temporary_files/file_Kq_IT.dta", clear
foreach var in Kq_CT Kq_Soft_DB Kq_TraEq Kq_OMach Kq_GFCF EMPE {
	merge 1:1 euklems_num year using replication_data/temporary_files/file_`var'.dta , generate(match_`var')
}

rename euklems_num euklems_pre
recode euklems_pre 20=1 21=2 22=3 1=4 2=5 3=6 4=7 5=8 6=9 7=10 8=11 9=12 ///
10=13 11=14 23=15 24=16 25=17 12=18 13=19 14=20 26=21 15=22 16=23 27=24 ///
28=25 17=26 18=27 19=28 29=29 30=30 31=31 33=32 32=33 34=34 35=35 37=36 ///
36=37 38=38 39=39 40=40, gen(euklems_num)

// generate ICT variables
gen ICT = (Kq_IT + Kq_CT + Kq_Soft_DB) / EMPE
//set ICT to missing for industries "Households as employers" (39) and "Extraterritorial org." (40) as there is no variation in ICT (always zero)
label var ICT "ICT capital stock (in 1000 GBP per worker)"

//machinery (non-ICT) per employee: transport + other machinery equipment and weapons
gen machinery = (Kq_TraEq + Kq_OMach) / EMPE
label var machinery "Non-ICT machinery capital stock (transport + other machinery equipment and weapons)"

//nonICT per employees
gen nonICT = (Kq_GFCF-(Kq_IT + Kq_CT + Kq_Soft_DB)) / EMPE
label var nonICT "Non-ICT capital stock (in 1000 GBP per worker)"

save replication_data/temporary_files/merged_euklems, replace

foreach var in Kq_IT Kq_CT Kq_Soft_DB	{
	clear
	import excel using replication_data/US_capital_17i.xlsx, sheet("`var'") firstrow
	drop if code == "TOT" | code == "MARKT"
	encode code, gen(euklems_num)
	reshape long `var', i(euklems_num) j(year) 
	drop if year<1997
	save replication_data/file_`var'_US, replace
}


use "replication_data/file_Kq_IT_US.dta"
foreach var in Kq_CT Kq_Soft_DB {
merge 1:1 euklems_num year using replication_data/file_`var'_US.dta , generate(match_`var')
}


//Labels 
rename euklems_num euklems_pre
recode euklems_pre 20=1 21=2 22=3 1=4 2=5 3=6 4=7 5=8 6=9 7=10 8=11 9=12 ///
10=13 11=14 23=15 24=16 25=17 12=18 13=19 14=20 26=21 15=22 16=23 27=24 ///
28=25 17=26 18=27 19=28 29=29 30=30 31=31 33=32 32=33 34=34 35=35 37=36 ///
36=37 38=38 39=39 40=40, gen(euklems_num)



*** ICT USA
foreach var in Kq_IT Kq_CT Kq_Soft_DB	{
	import excel using replication_data/US_capital_17i.xlsx, sheet("`var'") firstrow clear
	drop if code == "TOT" | code == "MARKT"
	encode code, gen(euklems_num)
	reshape long `var', i(euklems_num) j(year) 
	drop if year<1997
	save replication_data/file_`var'_US, replace
}


use "replication_data/file_Kq_IT_US.dta"
foreach var in Kq_CT Kq_Soft_DB {
	merge 1:1 euklems_num year using replication_data/file_`var'_US.dta , generate(match_`var')
}


//Labels 

rename euklems_num euklems_pre
recode euklems_pre 20=1 21=2 22=3 1=4 2=5 3=6 4=7 5=8 6=9 7=10 8=11 9=12 ///
10=13 11=14 23=15 24=16 25=17 12=18 13=19 14=20 26=21 15=22 16=23 27=24 ///
28=25 17=26 18=27 19=28 29=29 30=30 31=31 33=32 32=33 34=34 35=35 37=36 ///
36=37 38=38 39=39 40=40, gen(euklems_num)

gen ICT_nummerator_USA = (Kq_IT + Kq_CT + Kq_Soft_DB)
replace ICT_nummerator_USA = . if inlist(euklems_num, 39, 40)
keep ICT_nummerator_USA year euklems_num
save replication_data/merged_euklems_USA.dta, replace


*** TRADE 
//data from comtrade, Dorn crosswalked to US SIC, further crosswalked to euklems_num based on code file dorn-NACE2.R
import delimited "replication_data/trade_euklems_num.csv", clear
rename imports importsoriginal
gen imports = importsoriginal / 1000000
label variable imports "Imports summed per NACE rev2 group (only 2-19, tradable), in Millions."
drop importsoriginal
save "replication_data/temporary_files/imports.dta", replace


*** Routine Task Intensity (RTI) through O*NET
use "replication_data/onet2003-isco88.dta", replace
keep isco88* rti3
rename isco88 isco
rename rti3 rti

//take averages isco4 to isco3
gen isco88_3d = int(isco/10)
collapse (mean) rti, by(isco88_3d)
rename rti rti_3d
save "replication_data/temporary_files/merge_onet2003-isco88_3d.dta", replace


*-------------------------------------------------------------------------------------------------------------
* BHPS & UKHLS: Create individual-level panel 1991-2017
*-------------------------------------------------------------------------------------------------------------
* bhps 1991-2009 (harmonised)
* ukhls 2010-2017

* SN 6614 UK Data Archive
* Understanding Society: Waves 1-8, 2009-2017 and Harmonised BHPS: Waves 1-18, 1991-2009


* Create balanced data set (add variables that are only part of some single waves)
* Can be run as many times as needed, adds all potentially missing vars



* full list of all variables (except ID) used in either of both (or both) data sets
* create empty variable if missing.


*** define variables used from BHPS & UKLHS
local all_vars ///
	sex  birthy  age_dv yr2uk4 gor_dv /// /*socio-demographics*/
	jbstat  jbhrs jshrs paynu_dv /// /* job and wage */
	hiqual_dv /// /*education*/
	jbisco88_cc  /// /*occupation*/
	jbsic  jbsic92 jbsic07_cc /// /*industry*/
	vote1  vote2  vote3  vote4  vote5  vote6  vote7  vote8 /// /*politics*/
	lfsato  sclfsato /// /* life satisfaction */
	opsocc  opsoce   /// /* Supports Government Intervention */
	scopfama  scopfamb  scopfamd  /// /* Social progessivenes */
	istrtdaty  istrtdatd   istrtdatm  intdaty_dv  intdatm_dv  intdatd_dv /// /* Interview date */


// create folder for temporary files
capture mkdir "replication_data/temporary_files/bhps_ukhls"
	
	
*** BHPS
forval i = 1/18 {
	local x: word `i' of a b c d e f g h i j k l m n o p q r
	use "replication_data/bhps_ukhls/bhps_w`i'/b`x'_indresp", clear
	rename b`x'_* *
	
	foreach var in `all_vars' {
		capture confirm var `var', exact  //test if var exist
		if c(rc) == 111 { // if variable not found, create it as missing			
			gen `var' = .
			di "`var' was missing in wave `x' of BHPS, now created as missing"
		}
	}
	
	
	keep pidp `all_vars'
	gen wave=strpos("abcdefghijklmnopqr", "`x'")
	save "replication_data/temporary_files/bhps_ukhls/b`x'_indresp", replace
}


*** Understanding Society (UKHLS): Wave 1, 2009 (a) - Wave 8, 2017 (h)
forval i = 1/8 {
	local x: word `i' of a b c d e f g h 
	use "replication_data/bhps_ukhls/ukhls_w`i'/`x'_indresp", clear
	rename `x'_* *
	
	foreach var in `all_vars' {
		capture confirm var `var', exact //test if var exists
		if c(rc) == 111 { // if variable not found, create it as missing
			gen `var' = .
			di "`var' was missing in wave `x' of US, now created as missing"
		}
	}
	
	gen wave=strpos("abcdefghijklmnopqr", "`x'")+18
	keep pidp `all_vars'
	save "replication_data/temporary_files/bhps_ukhls/`x'_indresp", replace
}
	

// append all
use "replication_data/temporary_files/bhps_ukhls/ba_indresp.dta", clear
foreach x in bb bc bd be bf bg bh bi bj bk bl bm bn bo bp bq br a b c d e f g h {
append using "replication_data/temporary_files/bhps_ukhls/`x'_indresp.dta"
}

foreach var in `all_vars' {
	replace `var' = . if `var' < 0 // replace coded missings (<0) with .
	}

*-------------------------------------------------------------------------------------------------------------
* SOCIOECONOMIC & DEMOGRAPHIC VARIABLES
*-------------------------------------------------------------------------------------------------------------


*** ID
rename pidp id
format id %16.0f


*** WAVE
replace wave = wave + 1990


*** REGION
gen region = gor_dv
replace region=. if gor_dv<0
label var region "Government region ID"
label define region 1 "North East" 2 "North West" 3 "Yorkshire/Humber" 4 "East Midlands" 5 "West Midlands" 6 "East of England" 7 "London" 8 "South East" 9 "South West" 10 "Wales" 11 "Scotland" 12 "Northern Ireland" 13 "Channel Islands" 
label values region region


*** YEAR
rename istrtdaty year // create year based on the interview
replace year = intdaty_dv if year == .
replace year = 1991 if year == . & wave == 1991 
label variable year "Year"

rename istrtdatm interview_month 
replace interview_month = intdatm_dv if interview_month == .
rename istrtdatd interview_day
replace interview_day = intdatd_dv if interview_day == .

// create exact date of interview in stata format
gen interview_date = mdy(interview_month, interview_day, year)


*** AGE
gen age = age_dv
replace age= year - birthy if age_dv<0
gen agesq=age*age
label variable age "Age"
label variable agesq "Age squared"


*** GENDER
gen female = sex-1
replace female= . if sex<0
label var female "Dummy=1 if person identifies as female"

forvalues val = 1(1)3 {
	by id (year), sort: replace female = female[_n - `val'] if missing(female)
	by id (year), sort: replace female = female[_n + `val'] if missing(female)
	}

	
*** EDUCATION
gen edu = .
replace edu = 0 if hiqual_dv == 9
replace edu = 1 if hiqual_dv == 5
replace edu = 2 if hiqual_dv == 4
replace edu = 3 if hiqual_dv == 3
replace edu = 4 if hiqual_dv == 2
replace edu = 5 if hiqual_dv == 1
label variable edu "Highest Degree, harmonised (hiqual_dv)"
label define edu 0 "No Qualification" 1 "Other Qualification" 2 "GCSE etc" 3 "A-Level etc" 4 "Other higher degree" 5 "Degree"
label values edu edu
// impute missing
forvalues val = 1(1)10 {
by id (year), sort: replace edu = edu[_n - `val'] if edu ==.
by id (year), sort: replace edu = edu[_n + `val'] if missing(edu) & age >= 30
}


*** INCOME
gen incomem = paynu
replace incomem = . if paynu<0

label variable incomem "Monthly income (paynu_dv: Usual net pay per month: current job)"
label values incomem incomem

//Deflate wages to 2010 prices
merge m:1 year using replication_data/temporary_files/cpi.dta, gen(merge_cpi)
drop if merge_cpi == 2
drop merge_cpi

// deflate income variable
rename incomem incomem_nominal
gen incomem =  incomem_nominal * cpi /100
label var incomem_nominal "Usual net pay per month (current job) in current prices"
label var incomem "Monthly Net Pay"


*** HOURLY WAGE
replace jbhrs = jshrs if jbhrs == . // use self employed hours if no other hours (no overlap between the two)
gen hourly_wage = incomem / (jbhrs * 52 / 12)  if jbhrs >= 20 //assume four weeks pcm, only consider those with at least half-time employment
label var hourly_wage "Hourly net wage"


*** UNEMPLOYED
recode jbstat (-9/-1=.) (97=.) (1=0) (2=0) (3=1) (4=.) (5=0) (6=.) (7=.) (8=.) (9=.) (10=0) (11=0), gen(unemployed)
replace unemployed = unemployed * 100 //change to percentage points
label variable unemployed "Unemployed out of active population"


*** MIGRANT STATUS
gen immigrant = 0
replace immigrant = 1 if yr2uk4 != .
label var immigrant "Born outside the UK"
br yr2uk4 immigrant


*-------------------------------------------------------------------------------------------------------------
* POLITICAL VARIABLES
*-------------------------------------------------------------------------------------------------------------


*** TURNOUT
gen voted = .
replace voted=1 if vote7==1
replace voted=0 if vote7==2
label var voted "Turnout"


*** CONSERVATIVES
gen cons=.
replace cons = 1 if vote4 == 1 
replace cons = 1 if vote3 == 1 & cons ==.
replace cons = 0 if vote4 != 1 & vote4 != .
replace cons = 0 if vote3 != 1 & vote3 != . & cons ==.
replace cons = 1 if vote8 == 1 & cons==.
replace cons = 0 if vote8 != 1 & cons==. & vote8>0 & vote8 != .
label var cons "Conservatives"


*** LABOUR
gen labour=.
replace labour = 1 if vote4 == 2 
replace labour = 1 if vote3 == 2 & labour ==.
replace labour = 0 if vote4 != 2 & vote4 != .
replace labour = 0 if vote3 != 2 & vote3 != . & labour ==.
replace labour = 1 if vote8 == 2 & labour ==.
replace labour = 0 if vote8 != 2 & labour ==. & vote8>0 & vote8 != .
label var labour "Labour"


*** UKIP
gen ukip=.
replace ukip = 1 if vote4 == 12 
replace ukip = 1 if vote3 == 12 & ukip ==.
replace ukip = 0 if vote4 != 12 & vote4 != .
replace ukip = 0 if vote3 != 12 & vote3 != . & ukip ==.
replace ukip = 1 if vote8 == 12 & ukip ==.
replace ukip = 0 if vote8 != 12 & ukip ==. & vote8>0 & vote8 != .
replace ukip = . if year < 2013
label variable ukip "UKIP"


*** LIBDEM
gen libdem=.
replace libdem = 1 if vote4 == 3 
replace libdem = 1 if vote3 == 3 & libdem ==.
replace libdem = 0 if vote4 != 3 & vote4 != .
replace libdem = 0 if vote3 != 3 & vote3 != . & libdem ==.
replace libdem = 1 if vote8 == 3 & libdem ==.
replace libdem = 0 if vote8 != 3 & libdem ==. & vote8>0 & vote8 != .
label variable libdem "Liberal Democratic Party"


*** INCUMBENT
gen incumbent = .
replace incumbent = labour if interview_date < mdy(05, 10, 2010) //labor gvt
replace incumbent = 1 if (cons == 1 | libdem == 1 ) & interview_date >= mdy(05, 10, 2010) & interview_date < mdy(05, 07, 2015) //coaltion
replace incumbent = 0 if (cons == 0 & libdem == 0 ) & interview_date >= mdy(05, 10, 2010) & interview_date < mdy(05, 07, 2015) //coaltion
replace incumbent = cons if interview_date >= mdy(05, 07, 2015) //cons gvt
label var incumbent "Incumbent"


//change to percentagepoints
foreach var in voted cons labour ukip libdem incumbent{
replace `var' = `var' * 100
}


*-------------------------------------------------------------------------------------------------------------
* MECHANISM VARIABLES
*-------------------------------------------------------------------------------------------------------------


*** LIFE SATISFACTION
gen satlife = lfsato
replace satlife = sclfsato if satlife==. // lfsato missing for 2001
bysort id: ipolate satlife year, gen(satlife_imp)
label var satlife_imp "Life Satisfaction"
label define satlife 1 "Not at all" 4 "Not sat/dis" 7 "Completely sat" 
label values satlife satlife



*** SUPPORTS GOVERNMENT INTERVENTION
replace opsocc = (opsocc *(-1) ) + 6 //recode
replace opsoce = (opsoce * (-1) ) + 6 //recode
//impute
foreach mechvar in  opsocc opsoce {
bysort id: ipolate `mechvar' year, gen(`mechvar'_imp)
}
//PCA
pca opsocc_imp opsoce_imp
predict strong_gov_pca
label var strong_gov_pca "Supports Government Intervention"


*** SOCIAL PROGRESSIVENESS
replace scopfamd = (scopfamd * (-1) + 6) //recode
//IMPUTE
foreach mechvar in scopfama scopfamb scopfamd {
bysort id: ipolate `mechvar' year, gen(`mechvar'_imp)
}
// PCA
pca  scopfama_imp scopfamb_imp scopfamd_imp
predict social_progressive_pca
label var social_progressive_pca "Social Progressiveness"






*-------------------------------------------------------------------------------
save "replication_data/temporary_files/dataset_temp_1.dta", replace

